#Dennis Moskov, Master Thesis
#Find best variable number m with Random Forest
#RF for model fitness
#reduced model
#conversion, selectivity and yield
#using "randomForest" package

#install.packages("randomForest")
#library(randomForest)

#randomly shuffle the data
set.seed(77)                      # seed for reproducibility
DBf<-DB[sample(nrow(DB)),]

#initiate possible results
results<-rbind(c("Conversion","Selectivity","Yield"),c("X.MeOH","S.MeOH","Y.MeOH"),c(length(DBf)-2,length(DBf)-1,length(DBf)))

#loop through different outcomes
for (r in 1:3) {

#use desired outcome
useDB<-DBf[-c(1,as.numeric(results[3,-r]))]

#find best number of used variables
nv <- tuneRF(useDB[,-length(useDB)], useDB[,length(useDB)],stepFactor=1.5, improve=0.005,ntreeTry=500)
bestnv<-nv[which(grepl(min(nv[,2]), nv[,2]))]

#initiate results lists
res<-list()
res.names<-c("fitted","observed","residuals","residuals_squared")

#initiate matrices for regression and prediction values
reg<-matrix(, nrow = 10,ncol = 1)
colnames(reg) <-  "overall"
rownames(reg) <- c("Sample","RSS","TSS","MSS","R","R","adj.R","MSE","RMSE","SDEC")

	#fit a random forest
        nt<-500
        fit<-randomForest(useDB[,-length(useDB)],useDB[,length(useDB)],mtry=bestnv,importance=TRUE,ntree=nt)

        #use only important variables
        myvars<-names(useDB) %in% names(importance(fit)[(importance(fit)[,2]>sum(importance(fit)[,2])/100*0.5),2])
        nv <- tuneRF(useDB[,myvars], useDB[,length(useDB)],stepFactor=1.5, improve=0.005,ntreeTry=500)
        bestnv<-nv[which(grepl(min(nv[,2]), nv[,2]))]
        fit<-randomForest(useDB[,myvars],useDB[,length(useDB)],mtry=bestnv,importance=TRUE,ntree=nt)

        pred<-predict(fit,useDB, predict.all=FALSE)
	
	#save results for prediction 
    	res$fitted<-unname(pred)
	res$observed<-useDB[,length(useDB)]
	res$residuals<-res$observed-res$fitted
	res$residuals_squared<-(res$residuals)^2

	#save used variables 
	uvn<-varUsed(fit, by.tree=FALSE, count=FALSE)  #variable number
	uv<-varUsed(fit, by.tree=FALSE, count=TRUE)    #variable frequency
	names(uv)<-names(useDB[uvn])		       #variable names

	
	#values for model fitness for each fold
    	reg["Sample",1]<-nrow(useDB)                 			#number of datapoints in training set of the fold
    	reg["RSS",1]<-sum(res$residuals_squared)  				#Residual Sum of Squares
   	reg["TSS",1]<-sum((res$observed-mean(res$observed))^2)      	#Total Sum of Squares
   	reg["MSS",1]<-reg["TSS",1]-reg["RSS",1]                 		#Model Sum of Squares
    	reg["R",1]<-reg["MSS",1]/reg["TSS",1]                          	#coefficient of determination
    	reg["R",1]<-sqrt(reg["R",1])                           		#multiple correlation coefficient
   	reg["adj.R",1]<-1-((1-reg["R",1])*((reg["Sample",1]-1)/(reg["Sample",1]-length(uv))))                        		
										#Adjusted coefficient of determination
   	reg["MSE",1]<-sum(fit$mse)/length(fit$mse)                        	#Mean square error
   	reg["RMSE",1]<-sqrt(reg["MSE",1])                        		#Root Mean square error (residual standard deviation)
   	reg["SDEC",1]<-sqrt(reg["RSS",1]/reg["Sample",1])       		#Standard Deviation Error in Calculation


redVar<-names(importance(fit)[(importance(fit)[,2]>sum(importance(fit)[,2])/100*0.5),2])
#--------------------------------PLOT------------------------------------------------------------------------------------
#plot predicted vs. observed
x11()
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))

x11()
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),ylim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))



#plot predicted vs. residuals
x11() 
plot(res$fitted,res$residuals, col="blue",xlim=c(0,1), ylim=c(-1,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)

x11() 
plot(res$fitted,res$residuals, col="blue",xlim=c(min(res$fitted),max(res$fitted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)

#plot residual density
x11()
plot(density(res$residuals),xlab="Residuals", ylab="Density",main=paste("Density Plot of Residuals for",results[1,r]))


View(reg)

#-----------SAVE-----------------------------------------------------------------------------

#regression and prediction results
write.csv(reg, file =paste(results[1,r]," regression_values.csv"))
write.csv(predic, file =paste(results[1,r]," prediction_values.csv"))
write.csv(res, file =paste(results[1,r]," prediction_results.csv"), row.names=FALSE)

#regression and prediction results
write.csv(reg, file =paste(results[1,r]," regression_values.csv"))
write.csv(predic, file =paste(results[1,r]," prediction_values.csv"))
write.csv(res, file =paste(results[1,r]," prediction_results.csv"), row.names=FALSE)
write.csv(tvar,file=paste(results[1,r]," Variables_Tree_Construction.csv"))
write.csv(tvar,file="Variables_Forest_Construction.csv")

#plot predicted vs. observed
png(filename=paste(results[1,r]," fittedVSobs full.png"))
par(new=TRUE, pch=16)
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(0,1),ylim=c(0,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))
dev.off()

png(filename=paste(results[1,r]," fittedVSobs croped.png"))
par(new=TRUE, pch=16)
plot(res$fitted,res$observed,pch=16, col="blue",xlim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),ylim=c(min(0,min(res$fitted)),max(max(res$observed),max(res$fitted))),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab=paste("Observed MeOH",results[1,r] ),main=paste("Fitted vs. Observed MeOH",results[1,r]))
abline(0,1)
legend("topleft", legend = c(paste("number of trees:",nt),paste("number of random variables:",bestnv)))
dev.off()


#plot predicted vs. residuals
png(filename=paste(results[1,r]," fittedVSres full.png"))
plot(res$fitted,res$residuals, col="blue",xlim=c(0,1),ylim=c(-1,1),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

png(filename=paste(results[1,r]," fittedVSres croped.png"))
plot(res$fitted,res$residuals, col="blue",xlim=c(min(res$fitted),max(res$fitted)), ylim=c(min(res$residuals),max(res$residuals)),xaxs="i",yaxs="i",xlab=paste("Fitted MeOH",results[1,r]), ylab="Residuals",main=paste("Fitted MeOH",results[1,r],"vs. Residuals"),pch=16)
abline(h=0)
dev.off()

write.csv(redVar, file = paste(results[1,r]," reduction.csv"))
}







